# Number of majors in each major category, largest category first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
majors_processed %>%
  count(Major_category, sort = TRUE)
## # A tibble: 16 x 2
## Major_category n
## <chr> <int>
## 1 Engineering 29
## 2 Education 16
## 3 Humanities & Liberal Arts 15
## 4 Biology & Life Science 14
## 5 Business 13
## 6 Health 12
## 7 Computers & Mathematics 11
## 8 Agriculture & Natural Resources 10
## 9 Physical Sciences 10
## 10 Psychology & Social Work 9
## 11 Social Science 9
## 12 Arts 8
## 13 Industrial Arts & Consumer Services 7
## 14 Law & Public Policy 5
## 15 Communications & Journalism 4
## 16 Interdisciplinary 1
# One row per major category: head counts summed over its majors, a
# sample-size-weighted average of the per-major median salaries, and the
# resulting share of women. Rows with a missing Total are excluded before
# aggregating. Sorted from the highest share of women to the lowest.
by_major_category <- majors_processed %>%
  filter(!is.na(Total)) %>%
  group_by(Major_category) %>%
  summarise(
    Men = sum(Men),
    Women = sum(Women),
    Total = sum(Total),
    MedianSalary = sum(Median * Sample_size) / sum(Sample_size)
  ) %>%
  mutate(ShareWomen = Women / Total) %>%
  arrange(desc(ShareWomen))
# Histogram of the per-major median earnings, using ggplot2's default binning.
ggplot(majors_processed, aes(x = Median)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(ggrepel)
# Spread of the share of women across the majors of each category, with the
# categories ordered by their median share.
majors_processed %>%
  # Majors without a recorded share are removed up front so the missing value
  # neither affects the category ordering nor gets dropped silently later.
  filter(!is.na(ShareWomen)) %>%
  mutate(Major_category = fct_reorder(Major_category, ShareWomen)) %>%
  ggplot(aes(Major_category, ShareWomen, fill = Major_category)) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  theme_bw() +
  # NOTE(review): 0.153 is an unexplained reference value -- confirm what it
  # represents and consider naming it.
  geom_hline(yintercept = 0.153, lty = 2)
# Boxplot comparing median earnings across major categories. Categories are
# ordered by their median, and earnings are shown on a log10 axis with dollar
# labels.
majors_processed %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(aes(Major_category, Median, fill = Major_category)) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  scale_y_log10(labels = scales::dollar_format())
# Horizontal bar chart of the median (across majors) of the per-major median
# earnings in each category, with bars ordered by that value.
majors_processed %>%
  group_by(Major_category) %>%
  summarise(Median = median(Median)) %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(aes(x = Major_category, y = Median, fill = Major_category)) +
  geom_col(show.legend = FALSE) +
  coord_flip()
# The 20 majors with the highest median earnings. Each point is the median,
# the error bar spans the 25th to 75th percentile, and colour marks the
# major category. The earnings axis is extended to include zero.
majors_processed %>%
  arrange(desc(Median)) %>%
  select(Major, Major_category, Median, P25th, P75th) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, Median)) %>%
  ggplot(aes(x = Major, y = Median, colour = Major_category)) +
  geom_point() +
  coord_flip() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  expand_limits(y = 0)
# Median earnings of each major in the Business category, as horizontal bars
# ordered by earnings.
majors_processed %>%
  filter(Major_category == "Business") %>%
  mutate(Major = fct_reorder(Major, Median)) %>%
  ggplot(aes(Major, Median, fill = Major)) +
  geom_col(show.legend = FALSE) +
  coord_flip()
### Majors with the lowest share of women
library(ggrepel)
# The 20 majors with the smallest share of women, plotted as points ordered
# by that share.
majors_processed %>%
  arrange(ShareWomen) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, ShareWomen)) %>%
  ggplot(aes(x = Major, y = ShareWomen)) +
  geom_point() +
  coord_flip()
### Mapping share of women and median earnings per major
# Share of women against median earnings (log10 dollar axis) for majors with a
# sample size above 30, with a linear trend line. Points are labelled with
# their sample size.
majors_processed %>%
  filter(Sample_size > 30) %>%
  # ShareWomen is mapped directly: dplyr::desc() is a sorting helper that
  # negates its input, which would put a negated share on the x axis.
  ggplot(aes(ShareWomen, Median)) +
  geom_point() +
  scale_y_log10(labels = scales::dollar_format()) +
  geom_smooth(method = "lm") +
  # alpha is a fixed setting, so it goes outside aes(); inside aes() it would
  # be treated as a mapped variable and produce a legend.
  geom_text_repel(aes(label = Sample_size), alpha = 0.2) +
  theme_bw()
# Unweighted linear regression of median earnings on the share of women.
summary(lm(Median ~ ShareWomen, data = recent_grads))
##
## Call:
## lm(formula = Median ~ ShareWomen, data = recent_grads)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17261 -5474 -1007 3502 57604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56093 1705 32.90 <2e-16 ***
## ShareWomen -30670 2987 -10.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9031 on 170 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3828, Adjusted R-squared: 0.3791
## F-statistic: 105.4 on 1 and 170 DF, p-value: < 2.2e-16
# The 20 majors with the most graduates, as horizontal bars ordered by Total.
majors_processed %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  # Reorder after the top-20 cut so only the plotted rows (none with a missing
  # Total, since arrange() sorts NA last) take part in the ordering.
  mutate(Major = fct_reorder(Major, Total)) %>%
  ggplot(aes(Major, Total, fill = Major)) +
  geom_col() +
  # `labels` is spelled out in full rather than relying on partial matching.
  scale_y_continuous(labels = scales::comma_format()) +
  coord_flip()
# The 20 majors with the most graduates, as stacked horizontal bars split into
# men and women.
majors_processed %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, Total)) %>%
  # Reshape the Men / Women columns into one Gender / Number pair per row so
  # gender can be mapped to the fill colour.
  pivot_longer(
    cols = c(Men, Women),
    names_to = "Gender",
    values_to = "Number"
  ) %>%
  ggplot(aes(Major, Number, fill = Gender)) +
  geom_col() +
  # `labels` is spelled out in full rather than relying on partial matching.
  scale_y_continuous(labels = scales::comma_format()) +
  coord_flip()
# Category-level view: share of women against the weighted median salary,
# with a linear trend line and repelled category labels. The salary axis is
# extended to include zero.
ggplot(by_major_category, aes(x = ShareWomen, y = MedianSalary)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_text_repel(aes(label = Major_category), force = 0.2) +
  expand_limits(y = 0)
library(plotly)
# Interactive scatter of share of women against median earnings per major.
# Categories beyond the 7 most common are lumped together, point size follows
# the sample size, and one overall linear trend line is drawn (group = 1).
g <- majors_processed %>%
  mutate(Major_category = fct_lump(Major_category, 7)) %>%
  ggplot(
    aes(
      x = ShareWomen,
      y = Median,
      colour = Major_category,
      size = Sample_size
    )
  ) +
  # `label` is mapped here although geom_point() does not draw it;
  # presumably it is meant to surface the major name in the plotly tooltip.
  geom_point(aes(label = Major)) +
  geom_smooth(aes(group = 1), method = "lm") +
  expand_limits(y = 0) +
  scale_y_continuous(labels = scales::dollar_format()) +
  scale_x_continuous(labels = scales::percent_format())

ggplotly(g)
# Linear regression of median earnings on the share of women, weighting each
# major by its sample size.
majors_processed %>%
  select(Major, Total, ShareWomen, Sample_size, Median) %>%
  lm(
    Median ~ ShareWomen,
    data = .,
    weights = Sample_size
  ) %>%
  summary()
##
## Call:
## lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -260500 -61042 -13899 33262 865081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52073 1436 36.255 <2e-16 ***
## ShareWomen -23650 2403 -9.842 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 123000 on 170 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.363, Adjusted R-squared: 0.3592
## F-statistic: 96.87 on 1 and 170 DF, p-value: < 2.2e-16
library(broom)
# Fit the sample-size-weighted regression of median earnings on the share of
# women separately within each major category that has at least 10 majors,
# then collect the ShareWomen coefficient from every fit and add
# FDR-adjusted p-values.
majors_processed %>%
  select(Major, Major_category, Total, ShareWomen, Sample_size, Median) %>%
  add_count(Major_category) %>%
  filter(n >= 10) %>%
  # The nested column is named explicitly; the unnamed form
  # `nest(-Major_category)` is deprecated.
  nest(data = -Major_category) %>%
  mutate(
    model = map(
      data,
      ~ lm(Median ~ ShareWomen, data = .x, weights = Sample_size)
    ),
    tidied = map(model, tidy)
  ) %>%
  unnest(tidied) %>%
  # Keep only the slope term; the intercept rows are not of interest here.
  filter(term == "ShareWomen") %>%
  arrange(estimate) %>%
  mutate(fdr = p.adjust(p.value, method = "fdr"))
## # A tibble: 9 x 9
## Major_category data model term estimate std.error statistic p.value fdr
## <chr> <list> <lis> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Biology & Life~ <tibb~ <lm> Shar~ -43735. 20982. -2.08 0.0592 0.106
## 2 Engineering <tibb~ <lm> Shar~ -33912. 15418. -2.20 0.0366 0.0937
## 3 Computers & Ma~ <tibb~ <lm> Shar~ -28694. 18552. -1.55 0.156 0.235
## 4 Business <tibb~ <lm> Shar~ -28171. 9810. -2.87 0.0152 0.0937
## 5 Agriculture & ~ <tibb~ <lm> Shar~ -16263. 5975. -2.72 0.0297 0.0937
## 6 Physical Scien~ <tibb~ <lm> Shar~ -12820. 13349. -0.960 0.365 0.469
## 7 Education <tibb~ <lm> Shar~ -1996. 3084. -0.647 0.528 0.594
## 8 Humanities & L~ <tibb~ <lm> Shar~ -1814. 4128. -0.439 0.668 0.668
## 9 Health <tibb~ <lm> Shar~ 54721. 23427. 2.34 0.0416 0.0937